{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Creating binary features through thresholding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.datasets import load_boston\n",
    " \n",
    "boston = load_boston()\n",
    "X, y = boston.data, boston.target.reshape(-1, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.],\n",
       "       [ 0.],\n",
       "       [ 1.],\n",
       "       [ 1.],\n",
       "       [ 1.]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import preprocessing\n",
    "new_target = preprocessing.binarize(y,threshold=boston.target.mean())\n",
    "new_target[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1],\n",
       "       [0],\n",
       "       [1],\n",
       "       [1],\n",
       "       [1]])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(y[:5] > y.mean()).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.],\n",
       "       [ 0.],\n",
       "       [ 1.],\n",
       "       [ 1.],\n",
       "       [ 1.]])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "binar = preprocessing.Binarizer(y.mean())\n",
    "new_target = binar.fit_transform(y)\n",
    "new_target[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Cannot binarize a sparse matrix with threshold < 0",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-5-c9b5156c63ab>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msparse\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mcoo\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mspar\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcoo\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcoo_matrix\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbinomial\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m.25\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m100\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mpreprocessing\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbinarize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mspar\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mthreshold\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mC:\\Users\\jdavila\\Anaconda27\\lib\\site-packages\\sklearn\\preprocessing\\data.pyc\u001b[0m in \u001b[0;36mbinarize\u001b[1;34m(X, threshold, copy)\u001b[0m\n\u001b[0;32m   1561\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0msparse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0missparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1562\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mthreshold\u001b[0m \u001b[1;33m<\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1563\u001b[1;33m             raise ValueError('Cannot binarize a sparse matrix with threshold '\n\u001b[0m\u001b[0;32m   1564\u001b[0m                              '< 0')\n\u001b[0;32m   1565\u001b[0m         \u001b[0mcond\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m \u001b[1;33m>\u001b[0m \u001b[0mthreshold\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: Cannot binarize a sparse matrix with threshold < 0"
     ]
    }
   ],
   "source": [
    "from scipy.sparse import coo\n",
    "spar = coo.coo_matrix(np.random.binomial(1, .25, 100))\n",
    "preprocessing.binarize(spar, threshold=-1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
